package com.frogchou.pic.getmeitulu;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Links;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.util.Config;
import cn.edu.hfut.dmic.webcollector.util.FileUtils;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

/**
 * Crawls high-resolution gallery images from www.meitulu.com.
 *
 * <p>HTML pages have their full-size image URLs (thumbnails excluded) and the
 * next-gallery link queued as follow-up tasks; image responses are written to
 * disk under a directory tree mirroring the image URL's path.
 *
 * @author frogchou
 * @create 2017-3-14
 */
public class ImageCrawler extends BreadthCrawler {

	/** Root directory that downloaded images are saved under. */
	private final File downloadDir;

	/**
	 * @param crawlPath    directory used by the crawler for its bookkeeping/log data
	 * @param downloadPath root directory for downloaded images; created if absent
	 */
	public ImageCrawler(String crawlPath, String downloadPath) {
		super(crawlPath, true);
		downloadDir = new File(downloadPath);
		if (!downloadDir.exists()) {
			downloadDir.mkdirs();
		}
	}

	/**
	 * Dispatches on the response's Content-Type: HTML pages are mined for
	 * further URLs, image responses are saved to disk.
	 *
	 * @param page  the fetched page (HTML document or raw image bytes)
	 * @param links accumulator for follow-up crawl tasks
	 */
	@Override
	public void visit(Page page, CrawlDatums links) {
		// Strip the scheme so the remainder of the URL maps to a relative path.
		String pageUrl = page.getUrl();
		if (pageUrl.startsWith("https://")) {
			pageUrl = pageUrl.substring(8);
		} else if (pageUrl.startsWith("http://")) {
			pageUrl = pageUrl.substring(7);
		}
		// All path segments except the last form the target sub-directory;
		// the last segment is the file name (used only for image responses).
		String[] segments = pageUrl.split("/");
		StringBuilder folder = new StringBuilder();
		for (int i = 0; i < segments.length - 1; i++) {
			folder.append(segments[i]).append('/');
		}

		String contentType = page.getResponse().getContentType();
		if (contentType == null) {
			return;
		}
		if (contentType.indexOf("html") > -1) {
			// HTML page: queue the next-gallery link and every non-thumbnail image.
			Elements galleryLinks = page.getDoc().getElementsByClass("title_czky");
			// NOTE(review): Elements.attr(...) returns the attribute of the FIRST
			// matching element only — confirm a single follow-up link is intended.
			links.add(galleryLinks.attr("abs:href"));

			for (Element img : page.getDoc().select("img[src]")) {
				String imgSrc = img.attr("abs:src");
				if (imgSrc.indexOf("thumb") < 0) {
					links.add(imgSrc);
				}
			}
		} else if (contentType.startsWith("image")) {
			// Image response: ensure the mirrored directory exists, then write the
			// bytes. (Previously both branches of an exists() check duplicated the
			// whole save sequence; only mkdirs() needs to be conditional.)
			String imageFileName = segments[segments.length - 1];
			File fullPath = new File(downloadDir.getAbsolutePath() + File.separator + folder);
			if (!fullPath.exists()) {
				fullPath.mkdirs();
			}
			File imageFile = new File(fullPath, imageFileName);
			createFile(imageFile);
			writeFile(imageFile, page.getContent());
			System.out.println("保存图片 " + page.getUrl() + " 到 "
					+ imageFile.getAbsolutePath());
		}
	}

	/**
	 * Writes {@code content} to {@code imageFile}, wrapping any I/O failure
	 * in an unchecked exception (callers do not handle checked I/O errors).
	 */
	public void writeFile(File imageFile, byte[] content) {
		try {
			FileUtils.writeFile(imageFile, content);
		} catch (IOException ex) {
			throw new RuntimeException(ex);
		}
	}

	/**
	 * Creates {@code file} if it does not already exist. Failures are propagated
	 * (previously they were only printed, and the subsequent write proceeded
	 * against a file that was never created).
	 */
	public void createFile(File file) {
		if (!file.exists()) {
			try {
				file.createNewFile();
			} catch (IOException e) {
				// Consistent with writeFile: surface the failure instead of swallowing it.
				throw new RuntimeException(e);
			}
		}
	}

	public static void main(String[] args) throws Exception {
		// First arg: crawl/log path, second arg: download directory.
		ImageCrawler crawler = new ImageCrawler("D:\\spider\\log",
				"D:\\spider\\picture4");
		crawler.addSeed("http://www.meitulu.com/"); // site to crawl
		crawler.addRegex("http://www.meitulu.com/item/.*"); // URLs to follow
		// Resume from the previous crawl state instead of starting over.
		crawler.setResumable(true);
		crawler.setThreads(16);
		// main declares `throws Exception`; failures now propagate instead of
		// being silently discarded by an empty catch block.
		crawler.start(8);
	}

}
